# Install ThemePark from GitHub only when it is not already available, so
# re-running this file does not reinstall the package every time.
# install.packages("remotes")
if (!requireNamespace("ThemePark", quietly = TRUE)) {
  remotes::install_github("MatthewBJane/ThemePark")
}

library(ThemePark)
# Preview the first rows of the theme table (theme name and creator).
themepark_themes |> head()
##         theme                     creator
## 1      barbie             Matthew B. Jané
## 2 oppenheimer Matthew B. Jané & Toki Liam
## 3    starwars             Matthew B. Jané
## 4       zelda               Alex Slavenko
## 5  terminator               Alex Slavenko
## 6   spiderman           Velu P.K. Immonen
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggridges)

# Global knitr chunk defaults controlling figure width, aspect ratio and
# the displayed output width.
knitr::opts_chunk$set(fig.width = 6, fig.asp = 0.6, out.width = "90%")

# Pull PRCP/TMIN/TMAX for three stations over 2021-2022, attach a readable
# station name, divide the temperatures by 10, and move name/id to the front.
# NOTE(review): the division by 10 presumably converts tenths of a degree to
# degrees -- confirm against the data source documentation.
weather_df <-
  rnoaa::meteo_pull_monitors(
    c("USW00094728", "USW00022534", "USS0023B17S"),
    var = c("PRCP", "TMIN", "TMAX"),
    date_min = "2021-01-01",
    date_max = "2022-12-31"
  ) |>
  mutate(
    name = recode(
      id,
      USW00094728 = "CentralPark_NY",
      USW00022534 = "Molokai_HI",
      USS0023B17S = "Waterhole_WA"
    ),
    tmin = tmin / 10,
    tmax = tmax / 10
  ) |>
  relocate(name, id)
## using cached file: /Users/sarahyounes/Library/Caches/org.R-project.R/R/rnoaa/noaa_ghcnd/USW00094728.dly
## date created (size, mb): 2023-09-28 10:19:14.527811 (8.524)
## file min/max dates: 1869-01-01 / 2023-09-30
## using cached file: /Users/sarahyounes/Library/Caches/org.R-project.R/R/rnoaa/noaa_ghcnd/USW00022534.dly
## date created (size, mb): 2023-09-28 10:19:24.312619 (3.83)
## file min/max dates: 1949-10-01 / 2023-09-30
## using cached file: /Users/sarahyounes/Library/Caches/org.R-project.R/R/rnoaa/noaa_ghcnd/USS0023B17S.dly
## date created (size, mb): 2023-09-28 10:19:23.539384 (0.994)
## file min/max dates: 1999-09-01 / 2023-09-30

Class 1: Basic scatterplot

# Basic scatterplot of tmax against tmin, with the data passed directly.
ggplot(data = weather_df, mapping = aes(x = tmin, y = tmax)) +
  geom_point()
## Warning: Removed 17 rows containing missing values (`geom_point()`).

You can also take/start with the df and pipe the scatterplot, and it will create exactly the same plot:

# Same scatterplot, with the data frame piped into ggplot().
weather_df |>
  ggplot(mapping = aes(x = tmin, y = tmax)) +
  geom_point()
## Warning: Removed 17 rows containing missing values (`geom_point()`).

Piping may make filtering easier. You don’t have to create a separate df just for New York, for example:

# Scatterplot for the Central Park station only. The filter value must match
# the label assigned when weather_df is built ("CentralPark_NY"); the previous
# value "CentralParkNY" matched no rows and produced an empty plot.
nyc_weather <-
  weather_df |>
  filter(name == "CentralPark_NY") |>
  ggplot(aes(x = tmin, y = tmax)) +
  geom_point()

# The saved plot already has its point layer, so print it as is rather than
# stacking a second identical geom_point() layer on top.
nyc_weather

Fancy plot

color can be added to the entire plot in ggplot or to the points in geom_point

geom_smooth adds a smooth curve

se = FALSE removes standard error bars from the curve

alpha blending/shading makes the points more transparent (alpha = 0.3 means each point is 30% opaque, i.e., 70% transparent)

# Points are coloured by station; the smooth is not given a colour mapping,
# so a single curve is fitted to all points. se = FALSE drops the error band.
weather_df |>
  ggplot(mapping = aes(x = tmin, y = tmax)) +
  geom_point(mapping = aes(color = name), alpha = 0.3) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 17 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 17 rows containing missing values (`geom_point()`).

Plot with facets:

# One panel per station (facet columns), with points and a smooth in each.
weather_df |>
  ggplot(mapping = aes(x = tmin, y = tmax, color = name)) +
  geom_point(alpha = 0.3) +
  geom_smooth() +
  facet_grid(cols = vars(name))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: Removed 17 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 17 rows containing missing values (`geom_point()`).

Another fancy plot:

size = precipitation (higher precipitation = larger size)

# Daily tmax over time, one panel per station, with point size mapped to
# precipitation. A constant `size = 0.5` argument was previously passed next
# to aes(size = prcp); a fixed argument overrides the mapping, so every point
# was drawn at the same size. It is removed so the mapping takes effect.
ggplot(weather_df, aes(x = date, y = tmax, color = name)) +
  geom_point(aes(size = prcp), alpha = 0.3) +
  geom_smooth() +
  facet_grid(. ~ name)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: Removed 17 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 17 rows containing missing values (`geom_point()`).

Assigning specific colors to specific things:

To use one specific color, set it as a fixed argument in geom_point() (outside aes()) rather than as an aesthetic in ggplot(), because aesthetic mappings take variables from your dataset and map them onto colors

# Central Park tmax over time, with every point drawn in a fixed blue.
ggplot(
  data = filter(weather_df, name == "CentralPark_NY"),
  mapping = aes(x = date, y = tmax)
) +
  geom_point(color = "blue")

Hex plot:

# Hexagonal binning of tmax against tmin.
ggplot(data = weather_df, mapping = aes(x = tmin, y = tmax)) +
  geom_hex()
## Warning: Removed 17 rows containing non-finite values (`stat_binhex()`).

Line plot:

# Line plot of daily tmax for the Molokai station.
ggplot(
  data = filter(weather_df, name == "Molokai_HI"),
  mapping = aes(x = date, y = tmax)
) +
  geom_line()

Can combine line plot with points:

# Molokai daily tmax drawn as a faint line with small points on top.
ggplot(
  data = filter(weather_df, name == "Molokai_HI"),
  mapping = aes(x = date, y = tmax)
) +
  geom_line(alpha = 0.3) +
  geom_point(size = 0.3)
## Warning: Removed 1 rows containing missing values (`geom_point()`).

Univariate plotting

Easiest starting point for this is a basic histogram:

# Histogram of tmax across all stations, using the default binning.
weather_df |>
  ggplot(mapping = aes(x = tmax)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 17 rows containing non-finite values (`stat_bin()`).

fill argument fills in colors for histogram

position = “dodge” places bars for each group side-by-side - bars should avoid each other rather than stack up on each other (however, this can get difficult –> easier to use density plots)

# Histogram of tmax filled by station, with bars placed side by side.
weather_df |>
  ggplot(mapping = aes(x = tmax, fill = name)) +
  geom_histogram(binwidth = 2, position = "dodge")
## Warning: Removed 17 rows containing non-finite values (`stat_bin()`).

Density plot:

adjust argument is similar to binwidth in histograms

if you over-smooth or under-smooth, you may miss the message you’re trying to make

density plots show more distribution than box plots

# Overlaid, semi-transparent tmax densities, one per station.
weather_df |>
  ggplot(mapping = aes(x = tmax, fill = name)) +
  geom_density(adjust = 2, alpha = 0.3)
## Warning: Removed 17 rows containing non-finite values (`stat_density()`).

Basic boxplots:

# Single boxplot of tmax over the whole dataset.
weather_df |>
  ggplot(mapping = aes(y = tmax)) +
  geom_boxplot()
## Warning: Removed 17 rows containing non-finite values (`stat_boxplot()`).

Can compare by group, e.g., name:

# One tmax boxplot per station.
weather_df |>
  ggplot(mapping = aes(x = name, y = tmax)) +
  geom_boxplot()
## Warning: Removed 17 rows containing non-finite values (`stat_boxplot()`).

Violin plot:

# One tmax violin per station.
weather_df |>
  ggplot(mapping = aes(x = name, y = tmax)) +
  geom_violin()
## Warning: Removed 17 rows containing non-finite values (`stat_ydensity()`).

Ridge plot:

# Ridge plot: one tmax density per station, stacked along the y axis.
weather_df |>
  ggplot(mapping = aes(x = tmax, y = name)) +
  geom_density_ridges(scale = 0.9)
## Picking joint bandwidth of 1.54
## Warning: Removed 17 rows containing non-finite values
## (`stat_density_ridges()`).

Saving and embedding plots

# Build the plot once and keep it in an object so it can be saved and printed.
ggp_weather <-
  weather_df |>
  ggplot(mapping = aes(x = tmin, y = tmax)) +
  geom_point(mapping = aes(color = name), alpha = 0.5)

# Write the stored plot to a PDF with explicit dimensions.
ggsave(filename = "ggp_weather.pdf", plot = ggp_weather, width = 8, height = 5)
## Warning: Removed 17 rows containing missing values (`geom_point()`).
ggp_weather
## Warning: Removed 17 rows containing missing values (`geom_point()`).

You can also set options globally in the beginning after loading packages with this code:

knitr::opts_chunk$set(fig.width = 6, fig.asp = .6, out.width = "90%")

fig.asp = aspect ratio

Class 2: Same plot from last time

labs(x = ...) and labs(y = ...) rename the axes on the graph; labs(color = ...) renames the legend (in this case, since the colors are the legend); labs(title = ...) adds a header at the top; labs(caption = ...) adds a caption at the bottom.

scale_x_continuous(breaks = ...) sets where the tick marks go on the x axis, and labels = ... sets the text shown at those ticks. scale_y_continuous() works the same way for the y axis: position = "right" moves the y axis to the right side of the graph, trans = "sqrt" transforms the data, and limits = c(0, 30) restricts the axis to 0-30 instead of the default range of roughly -15 to 40 (points outside the limits are removed).

# Labelled scatterplot with custom x-axis ticks and a right-hand y axis
# limited to 0-30. The caption previously contained typos ("retreieved",
# "moaa"); it now names the source, NOAA, correctly.
weather_df |>
  ggplot(aes(x = tmin, y = tmax, color = name)) +
  geom_point(alpha = 0.5) +
  labs(
    x = "Min daily temp (Degrees C)",
    y = "Max daily temp",
    color = "Location",
    title = "Temperature plot",
    caption = "The data was retrieved from NOAA"
  ) +
  scale_x_continuous(
    breaks = c(-15, 0, 15),
    labels = c("-15 C", "0 C", "15 C")
  ) +
  scale_y_continuous(
    position = "right",
    # Points with tmax outside 0-30 are dropped from the plot.
    limits = c(0, 30)
  )
## Warning: Removed 302 rows containing missing values (`geom_point()`).

Changing colors

there are a lot of different scales you can adjust under scale_color

scale_color_hue works well for categorical variables e.g., + scale_color_hue(h = c(100, 300))

The viridis package is good for colors. For example, option = "magma" selects magma, one of the color scales it offers besides viridis. discrete = TRUE relates to the variable type (categorical vs. continuous).

# Labelled scatterplot using the discrete viridis colour scale for station.
# The caption previously contained typos ("retreieved", "moaa"); it now
# names the source, NOAA, correctly.
weather_df |>
  ggplot(aes(x = tmin, y = tmax, color = name)) +
  geom_point(alpha = 0.5) +
  labs(
    x = "Min daily temp (Degrees C)",
    y = "Max daily temp",
    color = "Location",
    title = "Temperature plot",
    caption = "The data was retrieved from NOAA"
  ) +
  viridis::scale_color_viridis(discrete = TRUE)
## Warning: Removed 17 rows containing missing values (`geom_point()`).

other chart details:

theme_bw() replaces the default grey background with a white one. Be careful about the order in which you add it: it can act as a reset of earlier theme() tweaks, so put it before any theme() adjustments.

theme_classic() is like theme_bw(), but it hides the gridlines and the thick border

theme_minimal() is my personal favorite

# Labelled viridis scatterplot with a minimal theme and the legend at the
# bottom. The theme() tweak comes after theme_minimal() so that the complete
# theme does not override it. The caption previously contained typos
# ("retreieved", "moaa"); it now names the source, NOAA, correctly.
weather_df |>
  ggplot(aes(x = tmin, y = tmax, color = name)) +
  geom_point(alpha = 0.5) +
  labs(
    x = "Min daily temp (Degrees C)",
    y = "Max daily temp",
    color = "Location",
    title = "Temperature plot",
    caption = "The data was retrieved from NOAA"
  ) +
  viridis::scale_color_viridis(discrete = TRUE) +
  theme_minimal() +
  theme(legend.position = "bottom")
## Warning: Removed 17 rows containing missing values (`geom_point()`).

# tmax over time: points coloured by station, one smooth over all points
# (the colour mapping is attached to the point layer only).
ggplot(data = weather_df, mapping = aes(x = date, y = tmax)) +
  geom_point(mapping = aes(color = name)) +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 17 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 17 rows containing missing values (`geom_point()`).

# Separate data frames for two stations so that each layer of the plot below
# can draw from a different dataset.
nyc_weather_df <- filter(weather_df, name == "CentralPark_NY")

hawaii_weather_df <- filter(weather_df, name == "Molokai_HI")

# Central Park as points, Molokai as a line (the line layer overrides `data`
# and inherits the x/y mapping), styled with the Barbie theme.
ggplot(data = nyc_weather_df, mapping = aes(x = date, y = tmax)) +
  geom_point() +
  geom_line(data = hawaii_weather_df) +
  ThemePark::theme_barbie() +
  labs(title = "I made this barbie-themed plot for you")

Patchwork

Faceting is for when you want the same plot duplicated across levels of a related variable; it is not a good fit if you want two completely different plots.

# tmax over time, one panel per station, styled with the Barbie theme.
ggplot(data = weather_df, mapping = aes(x = date, y = tmax, color = name)) +
  geom_point() +
  facet_grid(cols = vars(name)) +
  ThemePark::theme_barbie() +
  labs(title = "this one is slightly cuter")
## Warning: Removed 17 rows containing missing values (`geom_point()`).

to put them side by side:

# Two unrelated plots stored as objects.
# NOTE(review): only the component plots are created here; the step that
# actually places them side by side is not shown in this file.
ggp_temp_scatter <-
  ggplot(data = weather_df, mapping = aes(x = tmin, y = tmax, color = name)) +
  geom_point(alpha = .5)

ggp_precip_density <-
  ggplot(data = weather_df, mapping = aes(x = prcp, color = name)) +
  geom_density()

Data manipulation

Factor variables are thought of as categorical variables with order/levels, whereas characters are just characters with no structure. This is important now because when ggplot has to figure out what goes on which axis and the order of colors, it converts the variable to a factor in the background and by default uses alphabetical order. You need to change the variable structure (its factor levels) to change the order.

fct_reorder() lets you put name in order of some other variable; fct_relevel() sets the order of the levels manually.

# Manually order the stations for the boxplot. The third level previously
# read "Waterhole_MA", which is not a value of `name` (the label is
# "Waterhole_WA"), so fct_relevel() warned about an unknown level.
weather_df |>
  mutate(
    name = fct_relevel(name, c("Molokai_HI", "CentralPark_NY", "Waterhole_WA"))
  ) |>
  ggplot(aes(x = name, y = tmax)) +
  geom_boxplot()
## Warning: Removed 17 rows containing non-finite values (`stat_boxplot()`).

# Order the stations by their tmax values before plotting. tmax contains
# missing values; .na_rm = TRUE makes their removal explicit instead of
# relying on the default, which removes them with a warning.
weather_df |>
  mutate(
    name = fct_reorder(name, tmax, .na_rm = TRUE)
  ) |>
  ggplot(aes(x = name, y = tmax, fill = name)) +
  geom_violin()
## Warning: Removed 17 rows containing non-finite values (`stat_ydensity()`).

Complex facet grid

# Read the litters data, clean the column names, and split `group` after its
# third character into `dose` and `day_of_treatment`.
litters_df <-
  read_csv(file = "FAS_litters.csv") |>
  janitor::clean_names() |>
  separate(col = group, into = c("dose", "day_of_treatment"), sep = 3)
## Rows: 49 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Group, Litter Number
## dbl (6): GD0 weight, GD18 weight, GD of Birth, Pups born alive, Pups dead @ ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Read the pups data and clean the column names.
pups_df <-
  read_csv(file = "FAS_pups.csv") |>
  janitor::clean_names()
## Rows: 313 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Litter Number
## dbl (5): Sex, PD ears, PD eyes, PD pivot, PD walk
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Attach litter-level columns to each pup row, keeping every pup.
FAS_df <-
  pups_df |>
  left_join(litters_df, by = "litter_number")

# Reshape the pd_* columns to long format, drop incomplete rows, order the
# outcomes by postnatal day, and draw violins by dose in a grid of
# day_of_treatment (rows) by outcome (columns).
FAS_df |>
  select(dose, day_of_treatment, starts_with("pd")) |>
  pivot_longer(
    cols = pd_ears:pd_walk,
    names_to = "outcome",
    values_to = "postnatal_day"
  ) |>
  drop_na() |>
  mutate(outcome = fct_reorder(outcome, postnatal_day)) |>
  ggplot(mapping = aes(x = dose, y = postnatal_day)) +
  geom_violin() +
  facet_grid(rows = vars(day_of_treatment), cols = vars(outcome))

Session 10

lubridate: handy way of accessing month as a variable

# Rebuild weather_df as before, adding a `month` column that holds each
# date rounded down to the first day of its month.
weather_df <-
  rnoaa::meteo_pull_monitors(
    c("USW00094728", "USW00022534", "USS0023B17S"),
    var = c("PRCP", "TMIN", "TMAX"),
    date_min = "2021-01-01",
    date_max = "2022-12-31"
  ) |>
  mutate(
    name = recode(
      id,
      USW00094728 = "CentralPark_NY",
      USW00022534 = "Molokai_HI",
      USS0023B17S = "Waterhole_WA"
    ),
    tmin = tmin / 10,
    tmax = tmax / 10,
    month = lubridate::floor_date(date, unit = "month")
  ) |>
  relocate(name, id)
## using cached file: /Users/sarahyounes/Library/Caches/org.R-project.R/R/rnoaa/noaa_ghcnd/USW00094728.dly
## date created (size, mb): 2023-09-28 10:19:14.527811 (8.524)
## file min/max dates: 1869-01-01 / 2023-09-30
## using cached file: /Users/sarahyounes/Library/Caches/org.R-project.R/R/rnoaa/noaa_ghcnd/USW00022534.dly
## date created (size, mb): 2023-09-28 10:19:24.312619 (3.83)
## file min/max dates: 1949-10-01 / 2023-09-30
## using cached file: /Users/sarahyounes/Library/Caches/org.R-project.R/R/rnoaa/noaa_ghcnd/USS0023B17S.dly
## date created (size, mb): 2023-09-28 10:19:23.539384 (0.994)
## file min/max dates: 1999-09-01 / 2023-09-30
# Histogram of precipitation, used to look at the shape of its distribution.
ggplot(data = weather_df, mapping = aes(x = prcp)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 15 rows containing non-finite values (`stat_bin()`).

here are the big outliers:

# Rows with very large precipitation values.
filter(weather_df, prcp > 1000)
## # A tibble: 3 × 7
##   name           id          date        prcp  tmax  tmin month     
##   <chr>          <chr>       <date>     <dbl> <dbl> <dbl> <date>    
## 1 CentralPark_NY USW00094728 2021-08-21  1130  27.8  22.8 2021-08-01
## 2 CentralPark_NY USW00094728 2021-09-01  1811  25.6  17.2 2021-09-01
## 3 Molokai_HI     USW00022534 2022-12-18  1120  23.3  18.9 2022-12-01
# Scatterplot restricted to days whose tmax lies in [20, 30] (inclusive).
weather_df |>
  filter(between(tmax, 20, 30)) |>
  ggplot(mapping = aes(x = tmin, y = tmax, color = name)) +
  geom_point()

If you run this, you'll notice that grouping is sort of invisible: the only hint is the "# Groups: name" comment in the printed output below.

summarize(n_obs = n()) gives you the number of observations in each group, which is a good way to determine the sample size in the groups you care about.

# Number of observations for each station-month combination.
weather_df |>
  group_by(name, month) |>
  summarise(n_obs = n())
## `summarise()` has grouped output by 'name'. You can override using the
## `.groups` argument.
## # A tibble: 72 × 3
## # Groups:   name [3]
##    name           month      n_obs
##    <chr>          <date>     <int>
##  1 CentralPark_NY 2021-01-01    31
##  2 CentralPark_NY 2021-02-01    28
##  3 CentralPark_NY 2021-03-01    31
##  4 CentralPark_NY 2021-04-01    30
##  5 CentralPark_NY 2021-05-01    31
##  6 CentralPark_NY 2021-06-01    30
##  7 CentralPark_NY 2021-07-01    31
##  8 CentralPark_NY 2021-08-01    31
##  9 CentralPark_NY 2021-09-01    30
## 10 CentralPark_NY 2021-10-01    31
## # ℹ 62 more rows

can also count number of names (and edit the name)

# Rows per station, with the count column named n_obs.
count(weather_df, name, name = "n_obs")
## # A tibble: 3 × 2
##   name           n_obs
##   <chr>          <int>
## 1 CentralPark_NY   730
## 2 Molokai_HI       730
## 3 Waterhole_WA     730
# Station-by-month counts, spread so that each station becomes a column.
weather_df |>
  count(name, month) |>
  pivot_wider(names_from = name, values_from = n)
## # A tibble: 24 × 4
##    month      CentralPark_NY Molokai_HI Waterhole_WA
##    <date>              <int>      <int>        <int>
##  1 2021-01-01             31         31           31
##  2 2021-02-01             28         28           28
##  3 2021-03-01             31         31           31
##  4 2021-04-01             30         30           30
##  5 2021-05-01             31         31           31
##  6 2021-06-01             30         30           30
##  7 2021-07-01             31         31           31
##  8 2021-08-01             31         31           31
##  9 2021-09-01             30         30           30
## 10 2021-10-01             31         31           31
## # ℹ 14 more rows

General summaries

can take us beyond simply counting

2 ways to remove NA:

# Option 1: drop rows with missing tmax first, then summarise per station.
weather_df |>
  drop_na(tmax) |>
  group_by(name) |>
  summarise(
    mean_tmax = mean(tmax),
    median_tmax = median(tmax),
    sd_tmax = sd(tmax)
  )
## # A tibble: 3 × 4
##   name           mean_tmax median_tmax sd_tmax
##   <chr>              <dbl>       <dbl>   <dbl>
## 1 CentralPark_NY     17.7         18.9    9.96
## 2 Molokai_HI         28.3         28.3    1.80
## 3 Waterhole_WA        7.38         6.1    7.55
# Option 2: keep all rows and ignore missing tmax inside each summary call.
weather_df |>
  group_by(name) |>
  summarise(
    mean_tmax = mean(tmax, na.rm = TRUE),
    median_tmax = median(tmax, na.rm = TRUE),
    sd_tmax = sd(tmax, na.rm = TRUE)
  )
## # A tibble: 3 × 4
##   name           mean_tmax median_tmax sd_tmax
##   <chr>              <dbl>       <dbl>   <dbl>
## 1 CentralPark_NY     17.7         18.9    9.96
## 2 Molokai_HI         28.3         28.3    1.80
## 3 Waterhole_WA        7.38         6.1    7.55
# Mean tmax per station and month, plotted as connected points.
weather_df |>
  group_by(name, month) |>
  summarise(mean_tmax = mean(tmax, na.rm = TRUE)) |>
  ggplot(mapping = aes(x = month, y = mean_tmax, color = name)) +
  geom_point() +
  geom_line()
## `summarise()` has grouped output by 'name'. You can override using the
## `.groups` argument.

more pivot wider:

# Mean tmax per station and month, with one column per station.
weather_df |>
  group_by(name, month) |>
  summarise(mean_tmax = mean(tmax, na.rm = TRUE)) |>
  pivot_wider(names_from = name, values_from = mean_tmax)
## `summarise()` has grouped output by 'name'. You can override using the
## `.groups` argument.
## # A tibble: 24 × 4
##    month      CentralPark_NY Molokai_HI Waterhole_WA
##    <date>              <dbl>      <dbl>        <dbl>
##  1 2021-01-01           4.27       27.6        0.8  
##  2 2021-02-01           3.87       26.4       -0.786
##  3 2021-03-01          12.3        25.9        2.62 
##  4 2021-04-01          17.6        26.6        6.10 
##  5 2021-05-01          22.1        28.6        8.20 
##  6 2021-06-01          28.1        29.6       15.3  
##  7 2021-07-01          28.4        30.0       17.3  
##  8 2021-08-01          28.8        29.5       17.2  
##  9 2021-09-01          24.8        29.7       12.6  
## 10 2021-10-01          19.9        29.1        5.48 
## # ℹ 14 more rows

Sometimes it's nice to format things as actual tables, especially if you're sending them to others: use the kable() function from the knitr package.

# Same wide table of monthly mean tmax, rendered with kable and rounded to
# two digits.
weather_df |>
  group_by(name, month) |>
  summarise(mean_tmax = mean(tmax, na.rm = TRUE)) |>
  pivot_wider(names_from = name, values_from = mean_tmax) |>
  knitr::kable(digits = 2)
## `summarise()` has grouped output by 'name'. You can override using the
## `.groups` argument.
month CentralPark_NY Molokai_HI Waterhole_WA
2021-01-01 4.27 27.62 0.80
2021-02-01 3.87 26.37 -0.79
2021-03-01 12.29 25.86 2.62
2021-04-01 17.61 26.57 6.10
2021-05-01 22.08 28.58 8.20
2021-06-01 28.06 29.59 15.25
2021-07-01 28.35 29.99 17.34
2021-08-01 28.81 29.52 17.15
2021-09-01 24.79 29.67 12.65
2021-10-01 19.93 29.13 5.48
2021-11-01 11.54 28.85 3.53
2021-12-01 9.59 26.19 -2.10
2022-01-01 2.85 26.61 3.61
2022-02-01 7.65 26.83 2.99
2022-03-01 11.99 27.73 3.42
2022-04-01 15.81 27.72 2.46
2022-05-01 22.25 28.28 5.81
2022-06-01 26.09 29.16 11.13
2022-07-01 30.72 29.53 15.86
2022-08-01 30.50 30.70 18.83
2022-09-01 24.92 30.41 15.21
2022-10-01 17.43 29.22 11.88
2022-11-01 14.02 27.96 2.14
2022-12-01 6.76 27.35 -0.46

Grouped mutate

With a grouped mutate you can compute a group-level mean (here, one per station; it could equally be per month) instead of one mean for the entire dataset, for example

# Centre each station's tmax on that station's own mean, then plot over time.
weather_df |>
  group_by(name) |>
  mutate(
    mean_tmax = mean(tmax, na.rm = TRUE),
    centered_tmax = tmax - mean_tmax
  ) |>
  ggplot(mapping = aes(x = date, y = centered_tmax, color = name)) +
  geom_point()
## Warning: Removed 17 rows containing missing values (`geom_point()`).

min_rank() gives the ranking from lowest to highest; min_rank(desc()) puts it in descending/reverse order. Below, we filter for the coldest day(s) in each month.

# Rank tmax within each station-month and keep the rows ranked first
# (ties share rank 1, so a month can contribute more than one row).
weather_df |>
  group_by(name, month) |>
  mutate(tmax_rank = min_rank(tmax)) |>
  filter(tmax_rank == 1)
## # A tibble: 92 × 8
## # Groups:   name, month [72]
##    name           id          date        prcp  tmax  tmin month      tmax_rank
##    <chr>          <chr>       <date>     <dbl> <dbl> <dbl> <date>         <int>
##  1 CentralPark_NY USW00094728 2021-01-29     0  -3.8  -9.9 2021-01-01         1
##  2 CentralPark_NY USW00094728 2021-02-08     0  -1.6  -8.2 2021-02-01         1
##  3 CentralPark_NY USW00094728 2021-03-02     0   0.6  -6   2021-03-01         1
##  4 CentralPark_NY USW00094728 2021-04-02     0   3.9  -2.1 2021-04-01         1
##  5 CentralPark_NY USW00094728 2021-05-29   117  10.6   8.3 2021-05-01         1
##  6 CentralPark_NY USW00094728 2021-05-30   226  10.6   8.3 2021-05-01         1
##  7 CentralPark_NY USW00094728 2021-06-11     0  20.6  16.7 2021-06-01         1
##  8 CentralPark_NY USW00094728 2021-06-12     0  20.6  16.7 2021-06-01         1
##  9 CentralPark_NY USW00094728 2021-07-03    86  18.9  15   2021-07-01         1
## 10 CentralPark_NY USW00094728 2021-08-04     0  24.4  19.4 2021-08-01         1
## # ℹ 82 more rows

lag(): for example, how does yesterday's temperature relate to today's temperature, or how do previous temperatures impact current/future temperatures? lag(tmax, 3) would go 3 rows back. If you forget your grouping, the first rows of one group will pick up values from the end of the previous group.

# Within each station, add the tmax value from the previous row.
weather_df |>
  group_by(name) |>
  mutate(yesterday_tmax = lag(tmax, n = 1))
## # A tibble: 2,190 × 8
## # Groups:   name [3]
##    name           id      date        prcp  tmax  tmin month      yesterday_tmax
##    <chr>          <chr>   <date>     <dbl> <dbl> <dbl> <date>              <dbl>
##  1 CentralPark_NY USW000… 2021-01-01   157   4.4   0.6 2021-01-01           NA  
##  2 CentralPark_NY USW000… 2021-01-02    13  10.6   2.2 2021-01-01            4.4
##  3 CentralPark_NY USW000… 2021-01-03    56   3.3   1.1 2021-01-01           10.6
##  4 CentralPark_NY USW000… 2021-01-04     5   6.1   1.7 2021-01-01            3.3
##  5 CentralPark_NY USW000… 2021-01-05     0   5.6   2.2 2021-01-01            6.1
##  6 CentralPark_NY USW000… 2021-01-06     0   5     1.1 2021-01-01            5.6
##  7 CentralPark_NY USW000… 2021-01-07     0   5    -1   2021-01-01            5  
##  8 CentralPark_NY USW000… 2021-01-08     0   2.8  -2.7 2021-01-01            5  
##  9 CentralPark_NY USW000… 2021-01-09     0   2.8  -4.3 2021-01-01            2.8
## 10 CentralPark_NY USW000… 2021-01-10     0   5    -1.6 2021-01-01            2.8
## # ℹ 2,180 more rows

show the day-to-day variation in temperature:

# Per station: row-to-row change in tmax, then the standard deviation of
# those changes (missing differences are ignored).
weather_df |>
  group_by(name) |>
  mutate(temp_change = tmax - lag(tmax, n = 1)) |>
  summarise(sd_temp_change = sd(temp_change, na.rm = TRUE))
## # A tibble: 3 × 2
##   name           sd_temp_change
##   <chr>                   <dbl>
## 1 CentralPark_NY           4.43
## 2 Molokai_HI               1.24
## 3 Waterhole_WA             3.04